#!/usr/bin/Rscript
##########################################################################################
# Social Media and Policy Responses to the COVID-19 Pandemic in Switzerland
##########################################################################################
# Description:
##########################################################################################
# Preparation of Twitter Data 
##########################################################################################
# Contents
##########################################################################################
# 1) Dependencies
# 2) Preparations
# 3) Set Variable Names Needed
# 4) Load Data 
# 5) Sentiment
# 6) Topic allocation
# 7) Save curated data
##########################################################################################
# 1) Dependencies
##########################################################################################
suppressPackageStartupMessages(library(tools))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(readr))
suppressPackageStartupMessages(library(quanteda))
suppressPackageStartupMessages(library(quantreg))
suppressPackageStartupMessages(library(magrittr))
suppressPackageStartupMessages(library(purrr))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(glue))
suppressPackageStartupMessages(library(pbmcapply))
suppressPackageStartupMessages(library(urltools))
suppressPackageStartupMessages(library(cld2))
suppressPackageStartupMessages(library(rtweet))
##########################################################################################
# 2) Preparations
##########################################################################################
# Start from an empty workspace so objects left over from an interactive session
# cannot leak into this run (attached packages are not affected).
rm(list = ls())

# - set dir
# Rscript passes the script as "--file=<path>". When the code is run
# interactively there is no such argument and RStudio is asked for the path of
# the file that is open in the editor.
args <- commandArgs()

scriptName <- args[startsWith(args, "--file=")]

if (length(scriptName) == 0) {
  scriptName <- rstudioapi::getSourceEditorContext()$path
} else {
  scriptName <- sub("^--file=", "", scriptName[1])
}

if (length(scriptName) != 1 || is.na(scriptName) || !nzchar(scriptName)) {
  stop("Could not determine the location of this script.", call. = FALSE)
}

# dirname() also copes with a bare file name ("Rscript script.R" -> "."), which
# the former substring arithmetic turned into NA. The trailing separator is kept
# so that pathName can still be pasted in front of a file name.
pathName <- paste0(dirname(scriptName), "/")

# All relative paths below ("../data", "../lib") are resolved from here.
setwd(pathName)
parent_path <- getwd()
##########################################################################################
# 3) Set Variable Names Needed
##########################################################################################
###### -------- Define File Names -------- ######
twitter_data <- "Twitter_data"
##########################################################################################
# 4) Load Data 
##########################################################################################
# - load Data
df_raw <- readRDS(paste0("../data/", twitter_data, ".RDS"))

# - remove old topic vars and sentiment vars
# any_of() only drops the columns that are present, so a fresh input file that
# has never been through this script does not abort the run.
stale_cols <- c("sentiment_value", "topic", "positive_words", "negative_words")
df_raw <- df_raw %>% dplyr::select(-any_of(stale_cols))
rm(stale_cols)

gc()
# - get idea of scope
df_raw$Datum <- as.Date(df_raw$Datum)
range(df_raw$Datum)


# Load Function Data:
# load() restores the stored objects into the workspace AND returns their names,
# so a single call per file is enough. `dictionaries` collects those names.
dictionaries <- c(
  # Dictionary from Proksch et al. Multilingual Sentiment analysis
  # (based on the Lexicoder Dictionary)
  load("../lib/auto_dictionaries_lsd.RData"),
  # Improved lexicons; loaded second, so objects of the same name replace the
  # ones from the first file
  load("../lib/lsde_frenche_germane.RData")
)
gc()
##########################################################################################
# 5) Sentiment
##########################################################################################
# Text Cleaner Function:
# Normalises raw tweet text before it is handed to quanteda.
#
# Args:
#   clean_text: character vector of raw texts. (The argument keeps its original
#               name; inside the body it shadows the function name.)
# Returns:
#   A lower-cased character vector of the same length in which URLs, leading and
#   trailing whitespace, guillemets and all kinds of brackets are removed, a
#   blank is inserted after a full stop that is glued to the next sentence, and
#   en dashes are turned into ". ".
clean_text <- function(clean_text) {
  # Get sentences in order: "end.Next" / "end.(Next" -> "end. Next" / "end. (Next"
  tx_new <- gsub("(?<=[a-z])\\.(?=[A-Z\\(])", ". ", clean_text, perl = TRUE)
  # Remove URLs together with the whitespace that follows them
  tx_new <- gsub("http\\S+\\s*", "", tx_new, perl = TRUE)
  # Remove what the previous step leaves behind: a bare "http" (truncated link)
  tx_new <- gsub("http[[:alnum:]]*", "", tx_new, perl = TRUE)
  # Remove leading and trailing whitespace
  tx_new <- gsub("^[[:space:]]*", "", tx_new, perl = TRUE)
  tx_new <- gsub("[[:space:]]*$", "", tx_new, perl = TRUE)
  # Remove weird quotation marks and other signs quanteda does not like.
  # (The pattern used to start with an empty alternative, "|...", which can only
  # ever match the empty string.)
  tx_new <- gsub("\\»|\\«|\\}|\\{", "", tx_new)
  # An en dash separates clauses; treat it like the end of a sentence
  tx_new <- gsub("\\–", ". ", tx_new)
  tx_new <- gsub("\\{|\\}|\\[|\\]|\\(|\\)", "", tx_new)

  # Returned visibly instead of as the invisible value of an assignment
  as.character(tolower(tx_new))
}

df_raw$text <- clean_text(df_raw$Text)

# Add language to Tweet
# (this is not the final language coding but just a helper for the sentiment)
# cld2 returns up to three candidate languages per text. The best ranked
# candidate that is one of the four languages scored below wins; German is the
# fallback when none of the candidates qualifies.
sentiment_langs <- c("de", "fr", "it", "en")
text_df <- df_raw$text
gc()
# seq_along() instead of 1:length(): an empty input must not yield c(1, 0)
la <- pbmclapply(seq_along(text_df), function(l) {
  detected <- cld2::detect_language_mixed(text_df[l])[[1]]
  candidates <- as.character(detected$code)[1:3]
  usable <- candidates[candidates %in% sentiment_langs]
  if (length(usable) > 0) usable[1] else "de"
}, mc.cores = 2)
df_raw$la <- unlist(la)
rm(sentiment_langs, text_df, la)
gc()

# Show Language File Count:
df_raw %>% count(la)

# Split large Data in smaller chunks:
# consecutive blocks of n rows; the last block holds the remainder
nr <- nrow(df_raw)
n <- 100000
dflist <- split(df_raw, ceiling(seq_len(nr) / n))
# release the full table; it is rebuilt from the scored chunks
df_raw <- data.frame()
gc()

#Sentiment Calculator:
# Dictionary-based sentiment scoring for one chunk of tweets.
#
# The chunk is split by the language code in `data$la`. German, French, Italian
# and English tweets are scored with the object `extendeddict_<lang>` (element 1
# is used as the positive, element 2 as the negative word list) and stacked
# again in that order. Rows with any other language code are dropped.
#
# Args:
#   data:             data frame with the cleaned tweet in column `text` and the
#                     language code in column `la`.
#   dictionarieslist: kept for backward compatibility; it is never evaluated.
#                     The dictionaries are looked up by name instead.
#   quant_nodes:      number of threads passed to quanteda_options(); note that
#                     this changes a session-wide quanteda option.
#   nodes:            kept for backward compatibility; unused.
# Returns:
#   The rows of `data` in the four languages with three added columns:
#   sentiment_value = log((positive hits + 0.5) / (negative hits + 0.5)),
#   positive_words and negative_words (the matched words, blank separated, each
#   repeated as often as it occurs).
final_dat <- function(data, dictionarieslist = dictionaries, quant_nodes = 4, nodes = 4){
  quanteda_options(threads = quant_nodes)
  # Dictionaries are resolved from here outwards, i.e. usually in the workspace
  lookup_env <- environment()
  langs <- c("de", "fr", "it", "en")

  # Per document: the matched dictionary words and their number.
  # One row per document, keyed by the running document number.
  count_hits <- function(protex, patterns) {
    hits <- convert(dfm(protex, select = patterns, verbose = FALSE), to = "data.frame")
    # quanteda names documents "text1", "text2", ...; keep the running number
    doc_no <- as.numeric(gsub("\\D+", "", hits$doc_id))
    if (ncol(hits) < 2) {
      # No dictionary word occurs anywhere in this subset
      return(data.frame(document = doc_no, words = "", n_words = 0,
                        stringsAsFactors = FALSE))
    }
    hits %>%
      gather(Word, Occurences, -doc_id) %>%
      mutate(document = as.numeric(gsub("\\D+", "", doc_id))) %>%
      group_by(document) %>%
      summarize(words = paste(rep(Word, Occurences), collapse = " "),
                n_words = sum(Occurences))
  }

  # Scores all tweets of one language. Every language, English included, goes
  # through the same code path and reads the `doc_id` column of convert().
  score_language <- function(data_lang, lang) {
    if (nrow(data_lang) == 0) {
      return(data_lang)
    }
    protex <- corpus(data_lang[, 'text'])
    dict_lang_pos_neg <- get(paste0("extendeddict_", lang), envir = lookup_env)

    pos <- count_hits(protex, dict_lang_pos_neg[[1]])
    neg <- count_hits(protex, dict_lang_pos_neg[[2]])
    sentisave <- left_join(pos, neg, by = "document", suffix = c("_pos", "_neg"))
    sentisave$value <- as.numeric(
      log((sentisave$n_words_pos + 0.5) / (sentisave$n_words_neg + 0.5))
    )

    # Align by document number instead of relying on the row order
    rows <- match(seq_len(nrow(data_lang)), sentisave$document)
    data_lang$sentiment_value <- sentisave$value[rows]
    data_lang$positive_words <- sentisave$words_pos[rows]
    data_lang$negative_words <- sentisave$words_neg[rows]
    data_lang
  }

  #########################################################
  # Add Sentiment Column:
  data$sentiment_value <- NA
  data$positive_words <- NA
  data$negative_words <- NA
  cat("Make sure you have loaded the 'auto_dictionaries_lsd.RData' and 'lsde_frenche_germane.RData' in the environment.\nWithout them the function will not work\n")
  # Split data by Languages:
  by_lang <- lapply(langs, function(lang) data[which(data$la == lang), ])
  #Clear Mem:
  rm(data)
  gc()

  cat("Prepocessing of Tweets done! Starting Sentiment Analysis!\n")
  #########################################################
  scored <- Map(score_language, by_lang, langs)
  rm(by_lang)
  ################
  # Combine all frames again:
  # (unnamed, so rbind() does not prefix the row names)
  data <- do.call(rbind, unname(scored))
  rm(scored)
  gc()
  data
}

# Score the chunks one after the other. The results are collected in a
# preallocated list and bound once at the end, instead of copying the growing
# table in every iteration.
scored_chunks <- vector("list", length(dflist))
for (chunk_id in seq_along(dflist)) {
  scored_chunks[chunk_id] <- list(final_dat(dflist[[chunk_id]], quant_nodes = 4))
  # cat() writes a real line break; print() showed the "\n" literally
  cat(chunk_id, " th Element of list processed!\n", sep = "")
  gc()
}
df_raw <- do.call(rbind, scored_chunks)
rm(scored_chunks, chunk_id)

df_raw$sentiment_value <- as.numeric(df_raw$sentiment_value)
rm(dflist)
gc()
saveRDS(df_raw, "../data/Twitter_data.RDS")
##########################################################################################
# 6) Topic allocation
##########################################################################################
df_raw <- readRDS("../data/Twitter_data.RDS")

# A tweet counts as COVID-19 related as soon as its text contains one of these
# keywords anywhere (substring match on the lower-cased text without "#").
# NOTE(review): "herdenimunität" and "ausserordendliche lage" look misspelled
# (Herdenimmunität, ausserordentliche Lage) and therefore cannot match correctly
# spelled tweets - confirm before changing, since it alters the classification.
keywords_covid <- c(
  "corona", "covid19", "coronaschweiz", "coronach", "coronavirus", "coronavirusschweiz", "epidemie",
  "social distancing", "coronatests", "pandemie", "corona-pandemie",
  "coronakrise", "covid19ch", "covidch", "bag_ofsp_ufsp", "coronainfoch", "swisscovid",
  "pandemie", "covid", "coronakrise", "swiss-covid-app", "coronapandemie",
  "corona-sommer", "covid-19-erkrankungen", "corona-kredit", "corona-infektionen",
  "lockdown", "schutzmaske", "beatmungsgerät", "beatmungsgeräte", "pandémie",
  "masques", "crise sanitaire", "covid-19", "sars-cov-2", "coronagraben", "swisscovid",
  "coronavirus", "covid", "epidémie", "social distancing", "garder ses distances",
  "maske", "contact tracing", "masquer", "maschera", "respirator", "hygienemaske", "ffp2",
  "atemschutz", "swisscovid", "covidioten", "neuinfektionen", "hospitalisierungsrate",
  "covidapp", "coronaapp", "swiss-covid-app", "contact-tracing-app", "dp-3t", "swisscovidapp",
  "epidemiologisch", "antikörper", "maskenpflicht", "maskenzwang", "maskenwahn", "herdenimunität",
  "coronawarnapp", "contact-tracing", "contact tracing", "besondere lage", "ausserordendliche lage",
  "swisscovid-app", "corona-app", "covid-codes", "corona app", "corona warn app", " contact tracing app",
  "kontakt verfolgungs app", "kontakt rückverfolgung"
)
# The former "[[:punct:]]*[a-zA-Z0-9]*" prefix and "[a-zA-Z0-9]*" suffix could
# both match the empty string, so for a yes/no detection they added nothing but
# backtracking. A plain alternation of the keywords flags exactly the same texts.
search_pattern_covid <- paste(unique(keywords_covid), collapse = "|")

# str_detect() is vectorised: one call over all tweets replaces the forked
# per-tweet workers and the manual 100000-row chunk bookkeeping.
txt_norm <- gsub("#", "", tolower(df_raw$Text))
df_raw$iscovid19_txt <- as.numeric(str_detect(txt_norm, search_pattern_covid))
df_raw$topic <- ifelse(df_raw$iscovid19_txt == 1, "COVID19", "Anderes")
rm(keywords_covid, txt_norm)
gc()
saveRDS(df_raw, "../data/Twitter_data.RDS")

# Mask Topic:
# A tweet is tagged as mask related as soon as its text contains one of these
# keywords anywhere (substring match on the lower-cased text without "#").
keywords_mask <- c(
  "schutzmaske", "masques", "maske", "masquer", "maschera", "hygienemaske", "ffp2", "mascherina", "masken", "atemschutzmasken",
  "atemschutz", "maskenpflicht", "maskenzwang", "maskenwahn", "mundnasenschutz", "mund-nasen-schutz", "gesichtsschutz", "masque"
)
# The former "[[:punct:]]*[a-zA-Z0-9]*" prefix and "[a-zA-Z0-9]*" suffix could
# both match the empty string, so for a yes/no detection they added nothing but
# backtracking. A plain alternation of the keywords flags exactly the same texts.
search_pattern_mask <- paste(unique(keywords_mask), collapse = "|")

# str_detect() is vectorised: one call over all tweets replaces the forked
# per-tweet workers and the manual 100000-row chunk bookkeeping.
txt_norm <- gsub("#", "", tolower(df_raw$Text))
df_raw$ismask_txt <- as.numeric(str_detect(txt_norm, search_pattern_mask))
df_raw$topic_1 <- ifelse(df_raw$ismask_txt == 1, "Mask", "Anderes")
rm(keywords_mask, txt_norm)
gc()
saveRDS(df_raw, "../data/Twitter_data.RDS")

# CovidApp Topic:
# Narrow app definition: the tweet names the SwissCovid app, contact tracing or
# a closely related term explicitly (substring match on the lower-cased text
# without "#").
keywords_covidapp <- c(
  "swiss-covid-app", "covidapp", "coronaapp", "swiss covid app", "contact-tracing-app", "dp-3t", "swisscovidapp", "swisscovid-app",
  "corona-app", "covid-codes", "contact tracing", "swiss-covid-app",
  "coronawarnapp", "corona app", "corona warn app", "contact tracing app", "kontaktverfolgungsapp", "kontakt-verfolgungs-app",
  "swisscovid", "covidcodes", "covid codes", "corona app", "dp3t", "dp 3t"
)
# The former "[[:punct:]]*[a-zA-Z0-9]*" prefix and "[a-zA-Z0-9]*" suffix could
# both match the empty string, so for a yes/no detection they added nothing but
# backtracking. A plain alternation of the keywords flags exactly the same texts.
search_pattern_covidapp <- paste(unique(keywords_covidapp), collapse = "|")

# str_detect() is vectorised: one call over all tweets replaces the forked
# per-tweet workers and the manual 100000-row chunk bookkeeping.
txt_norm <- gsub("#", "", tolower(df_raw$Text))
df_raw$iscovidapp_txt <- as.numeric(str_detect(txt_norm, search_pattern_covidapp))
df_raw$topic_2 <- ifelse(df_raw$iscovidapp_txt == 1, "CovidApp", "Anderes")
rm(keywords_covidapp, txt_norm)
gc()
saveRDS(df_raw, "../data/Twitter_data.RDS")

# Wide app definition: the explicit app keywords plus the stand-alone words
# "app" / "apps". Because the bare word "app" is unspecific, the resulting topic
# is only assigned to tweets that were already flagged as COVID-19 related.
keywords_covidapp_2 <- c(
  "swiss-covid-app", "covidapp", "coronaapp", "swiss covid app", "contact-tracing-app", "dp-3t", "swisscovidapp", "swisscovid-app",
  "corona-app", "covid-codes", "contact tracing", "swiss-covid-app",
  "coronawarnapp", "corona app", "corona warn app", "contact tracing app", "kontaktverfolgungsapp", "kontakt-verfolgungs-app",
  "swisscovid", "covidcodes", "covid codes", "corona app", "dp3t", "dp 3t"
)
# The alternatives are joined without any separator other than "|". The former
# paste() call inserted a blank in front of "|\\bapp\\b", so the last keyword
# ("dp 3t") only matched when it was followed by a space.
# The former "[[:punct:]]*[a-zA-Z0-9]*" prefix and "[a-zA-Z0-9]*" suffix could
# both match the empty string and are dropped: they do not change which texts
# are detected.
search_pattern_covidapp_2 <- paste(
  c(unique(keywords_covidapp_2), "\\bapp\\b", "\\bapps\\b"),
  collapse = "|"
)

# str_detect() is vectorised: one call over all tweets replaces the forked
# per-tweet workers and the manual 100000-row chunk bookkeeping.
txt_norm <- gsub("#", "", tolower(df_raw$Text))
df_raw$iscovidapp_txt_2 <- as.numeric(str_detect(txt_norm, search_pattern_covidapp_2))
df_raw$topic_2_2 <- ifelse(df_raw$iscovid19_txt == 0, "Anderes", 
                           ifelse(df_raw$iscovidapp_txt_2 == 1, "CovidApp", "Anderes"))
rm(keywords_covidapp_2, txt_norm)
gc()
saveRDS(df_raw, "../data/Twitter_data.RDS")


# Keep the plain COVID-19 flag under its own name before `topic` is rebuilt
df_raw$topic_0 <- df_raw$topic

# Combined topic, most specific label first.
# "App & Masks" needs the app topic AND the mask flag. The condition used to
# test iscovidapp_txt instead of ismask_txt, so mask content never entered the
# combined label and every explicit app tweet was labelled "App & Masks".
is_app <- df_raw$topic_2_2 == "CovidApp"
is_mask <- df_raw$ismask_txt == 1
df_raw$topic <- ifelse(is_app & is_mask, "App & Masks", 
                       ifelse(is_app, "App", 
                              ifelse(is_mask, "Masks",
                                     ifelse(df_raw$iscovid19_txt == 1, "Covid19", "Anderes"))))
rm(is_app, is_mask)

# Number of tweets per topic
df_raw %>% count(topic)
saveRDS(df_raw, "../data/Twitter_data.RDS")

# Label Spanish, Portuguese and other foreign-language tweets... somehow we have
# a lot of them for some reason (Brazil <- BDP).
# This step only recodes `la` on the raw text; no rows are dropped here.
# Order of preference among cld2's (up to three) candidates:
#   1. the best ranked candidate in German, French, Italian or English,
#   2. the best ranked candidate among the listed foreign languages,
#   3. German as the fallback.
main_langs <- c("de", "fr", "it", "en")
foreign_langs <- c("es", "pt", "hi", "ja", "nl", "pl", "ro", "ru")
text_df <- df_raw$Text
gc()
# seq_along() instead of 1:length(): an empty input must not yield c(1, 0)
la <- pbmclapply(seq_along(text_df), function(l) {
  detected <- cld2::detect_language_mixed(text_df[l])[[1]]
  candidates <- as.character(detected$code)[1:3]
  ranked <- c(
    candidates[candidates %in% main_langs],
    candidates[candidates %in% foreign_langs]
  )
  if (length(ranked) > 0) ranked[1] else "de"
}, mc.cores = 2)
df_raw$la <- unlist(la)
rm(la, text_df, main_langs, foreign_langs)
##########################################################################################
# 7) Save curated data
##########################################################################################
df_raw <- df_raw %>% dplyr::rename(topic_covid = topic_0, topic_mask = topic_1, topic_app = topic_2_2, topic_app_small = topic_2)

saveRDS(df_raw, "../data/Twitter_data.RDS")

# Free the dictionaries of languages that are not used. df_raw itself must stay:
# it used to be part of this rm() call, which made the select() below fail with
# "object 'df_raw' not found". Only objects that really exist are removed, so no
# warnings are raised for dictionaries that were never loaded.
unused_dicts <- paste0(
  "extendeddict_",
  c("bg", "cs", "da", "el", "es", "et", "fi", "hu", "lt",
    "lv", "nl", "pl", "pt", "ro", "sk", "sl", "sv")
)
rm(list = intersect(unused_dicts, ls()))
rm(unused_dicts)
gc()


# Columns of the minified data set.
# NOTE(review): "is_covid19", "is_covid19_hst" and "is_covid19_txt" are not
# created in this script and presumably come with the input data - all_of()
# stops with an error naming any column that is missing.
keep_cols <- c("User_id","Status_id","Verified","Quoted_verified","Retweet_verified",
               "Lang","Screen_name","Quoted_screen_name","Retweet_screen_name",
               "Is_quote","Is_retweet","Retweet_user_id","Quoted_user_id",
               "Reply_to_screen_name","Reply_to_status_id","Reply_to_user_id",
               "Datum","Datum_full","Hashtags","Last_Name","First_Name","Name",
               "Akteur","Kürzel","Akteur.Typ","Year.of.Birth","Gender","Party","Party_Short",
               "Canton","Municipality","Zip","la","sentiment_value",
               "is_covid19","is_covid19_hst","is_covid19_txt","iscovid19_txt","iscovidapp_txt",
               "iscovidapp_txt_2","ismask_txt","topic","topic_app","topic_app_small","topic_covid",
               "topic_mask","Text")
df_raw <- df_raw %>% dplyr::select(all_of(keep_cols))
rm(keep_cols)

# Get Language of Tweet again since this has not been done perfectly before:
# the best ranked of cld2's (up to three) candidates in German, French, Italian
# or English wins; otherwise the top candidate is kept whatever it is.
# The raw text is taken out of the table first, so the minified file is saved
# without the `Text` column.
main_langs <- c("de", "fr", "it", "en")
text_df <- df_raw$Text
df_raw$Text <- NULL
gc()
# seq_along() instead of 1:length(): an empty input must not yield c(1, 0)
la <- pbmclapply(seq_along(text_df), function(l) {
  detected <- cld2::detect_language_mixed(text_df[l])[[1]]
  codes <- as.character(detected$code)
  candidates <- codes[1:3]
  preferred <- candidates[candidates %in% main_langs]
  if (length(preferred) > 0) preferred[1] else codes[1]
}, mc.cores = 2)
df_raw$la <- unlist(la)
rm(main_langs, text_df, la)
gc()

saveRDS(df_raw, "../data/Twitter_data_minified.RDS")
